R Setup

library(tidyverse)
library(magrittr)
library(edgeR)
library(AnnotationHub)
library(scales)
library(pander)
library(ggrepel)
library(grid)
library(EDASeq)
library(multcomp)
library(org.Dr.eg.db)
library(org.Hs.eg.db)
library(RUVSeq)
library(gganimate)
theme_set(theme_bw())
panderOptions("table.split.table", Inf)
panderOptions("big.mark", ",")
panderOptions("table.style", "rmarkdown")
if (interactive()) setwd(here::here("R"))

Metadata setup

metaData <- data.frame(
  sample, sex, family, mutation, provider, platform, readLength, oldName
  )
metaData %>% pander()
sample sex family mutation provider platform readLength oldName
2016Q96K97_Wt1 F 2016Q96K97 Q96K97del ACRF Hiseq? 150 1_non_mutant_Q96_K97del_6mth_10_03_2016_S1_fem
2016Q96K97_Wt2 F 2016Q96K97 Q96K97del ACRF Hiseq? 150 2_non_mutant_Q96_K97del_6mth_10_03_2016_S2_fem
2016Q96K97_Wt3 F 2016Q96K97 Q96K97del ACRF Hiseq? 150 3_non_mutant_Q96_K97del_6mth_10_03_2016_S3_fem
2016K97Gfs_Wt1 F 2016K97Gfs K97Gfs ACRF Hiseq? 150 4_non_mutant_K97Gfs_6mth_10_03_2016_S1_fem
2016K97Gfs_Wt2 F 2016K97Gfs K97Gfs ACRF Hiseq? 100?? 5_non_mutant_K97Gfs_6mth_10_03_2016_S2_fem
2016K97Gfs_Wt3 F 2016K97Gfs K97Gfs ACRF Hiseq? 150 6_non_mutant_K97Gfs_6mth_10_03_2016_S3_fem
2016PSEN2_Wt1 F 2016PSEN2 S4Ter ACRF Hiseq? 150 9_Ps2Ex3M1_WT_6month_07_07_2016_F3_79_Fem
2016PSEN2_Wt2 F 2016PSEN2 S4Ter ACRF Hiseq? 150 10_Ps2Ex3M1_WT_6month_07_07_2016_F3_86_Fem
2016PSEN2_Wt3 F 2016PSEN2 S4Ter ACRF Hiseq? 150 11_Ps2Ex3M1_WT_6month_07_07_2016_F3_88_Fem
2016PSEN2_Wt4 F 2016PSEN2 S4Ter ACRF Hiseq? 150 12_Ps2Ex3M1_WT_6month_07_07_2016_F3_95_Fem
2017Q96K97_Wt1 M 2017Q96K97 Q96K97del SAHMRI Nextseq 75 1-MORGAN-6P-PN1_S2_R1_001
2017Q96K97_Wt2 F 2017Q96K97 Q96K97del SAHMRI Nextseq 75 2-MORGAN-6P-PN2_S5_R1_001
2017Q96K97_Wt3 F 2017Q96K97 Q96K97del SAHMRI Nextseq 75 3-MORGAN-6P-PN3_S10_R1_001
2017Q96K97_Wt4 F 2017Q96K97 Q96K97del SAHMRI Nextseq 75 4-MORGAN-6P-PN4_S8_R1_001
2018PSEN2_Wt1 F 2018PSEN2 FSvsFAD SAHMRI Nextseq 75 11__-_5
2018PSEN2_Wt2 F 2018PSEN2 FSvsFAD SAHMRI Nextseq 75 12__-_4
2018PSEN2_Wt3 F 2018PSEN2 FSvsFAD SAHMRI Nextseq 75 13__-_3
2018PSEN2_Wt4 M 2018PSEN2 FSvsFAD SAHMRI Nextseq 75 14__-_2
2018PSEN2_Wt5 M 2018PSEN2 FSvsFAD SAHMRI Nextseq 75 15__-_1
2019Q96K97_Wt1 F 2019Q96K97 Q96K97del Novogene Novaseq 150 W1
2019Q96K97_Wt2 F 2019Q96K97 Q96K97del Novogene Novaseq 150 W2
2019Q96K97_Wt3 F 2019Q96K97 Q96K97del Novogene Novaseq 150 W3
2019Q96K97_Wt4 F 2019Q96K97 Q96K97del Novogene Novaseq 150 W4
2019Q96K97_Wt5 F 2019Q96K97 Q96K97del Novogene Novaseq 150 W5
2019SORL1_Wt1 F 2019SORL1 W1818X Novogene Novaseq 150 3_13W_1
2019SORL1_Wt2 F 2019SORL1 W1818X Novogene Novaseq 150 4_14W_1
2019SORL1_Wt3 M 2019SORL1 W1818X Novogene Novaseq 150 6_16W_1
2019SORL1_Wt4 M 2019SORL1 W1818X Novogene Novaseq 150 8_19W_1
2019SORL1_Wt5 M 2019SORL1 W1818X Novogene Novaseq 150 9_1W_1
2019SORL1_Wt6 F 2019SORL1 W1818X Novogene Novaseq 150 12_9W_1

Data setup

Load data

counts1 <- read_tsv("../2_alignedData/featureCounts/genes.out") %>%
  set_colnames(basename(colnames(.))) %>%
  set_colnames(str_remove(colnames(.),"Aligned.sortedByCoord.out.bam")) %>%
  set_colnames(str_remove(colnames(.),".fq.gz"))
counts2 <- read_tsv("../../20190122_Q96K97_NoStress_RNASeq/2_alignedData/featureCounts/genes_ss.out") %>%
  set_colnames(basename(colnames(.))) %>%
  set_colnames(str_remove(colnames(.), "_ssAligned.sortedByCoord.out.bam")) %>%
  dplyr::select(Geneid, starts_with("W"))
counts <- counts1 %>%
  full_join(counts2) %>% 
  column_to_rownames("Geneid") %>%
  set_colnames(metaData$sample[match(colnames(.), metaData$oldName)]) %>%
  rownames_to_column("Geneid")

Create DGE List

dgeList <- counts %>%
  as.data.frame() %>%
  column_to_rownames("Geneid") %>%
  DGEList() %>%
  calcNormFactors()
# Set group variables
dgeList$samples$group <- colnames(dgeList) %>%
  str_extract("201\\d(Q96K97|K97Gfs|PSEN2|SORL1)") %>%
  factor(
    levels = c(
      "2016Q96K97", "2016K97Gfs", "2016PSEN2", "2017Q96K97",
      "2018PSEN2", "2019Q96K97", "2019SORL1"
    )
  )
dgeList$samples$sample <- colnames(dgeList)

Add gene information

# Add AnnotationHub and subset to search for zebrafish
ah <- AnnotationHub()
ah %>%
  subset(species == "Danio rerio") %>%
  subset(dataprovider == "Ensembl") %>%
  subset(rdataclass == "EnsDb")
## AnnotationHub with 11 records
## # snapshotDate(): 2019-05-02 
## # $dataprovider: Ensembl
## # $species: Danio rerio
## # $rdataclass: EnsDb
## # additional mcols(): taxonomyid, genome, description,
## #   coordinate_1_based, maintainer, rdatadateadded, preparerclass,
## #   tags, rdatapath, sourceurl, sourcetype 
## # retrieve records with, e.g., 'object[["AH53201"]]' 
## 
##             title                           
##   AH53201 | Ensembl 87 EnsDb for Danio Rerio
##   AH53705 | Ensembl 88 EnsDb for Danio Rerio
##   AH56671 | Ensembl 89 EnsDb for Danio Rerio
##   AH57746 | Ensembl 90 EnsDb for Danio Rerio
##   AH60762 | Ensembl 91 EnsDb for Danio Rerio
##   ...       ...                             
##   AH64434 | Ensembl 93 EnsDb for Danio Rerio
##   AH64906 | Ensembl 94 EnsDb for Danio rerio
##   AH67932 | Ensembl 95 EnsDb for Danio rerio
##   AH69169 | Ensembl 96 EnsDb for Danio rerio
##   AH73861 | Ensembl 97 EnsDb for Danio rerio
# Select correct Ensembl release
ensDb <- ah[["AH69169"]]
# Extract GenomicRanges object from ensDb
genesGR <- genes(ensDb)
# Remove redundant columns from mcols
mcols(genesGR) <- mcols(genesGR)[c("gene_id", "gene_name", 
                                   "gene_biotype", "entrezid")]
# Add genesGR to DGEList using rownames of DGEList to reorder the genesGR
dgeList$genes <- genesGR[rownames(dgeList),]

Data QC

# Perform logical test to see how many genes were not detected in dataset
dgeList$counts %>%
  rowSums() %>%
  is_greater_than(0) %>%
  table()
## .
## FALSE  TRUE 
##  3840 28217
# Check for genes having >= 3 samples with cpm > 1
dgeList %>%
  cpm() %>%
  is_greater_than(1) %>%
  rowSums() %>%
  is_weakly_greater_than(3) %>%
  table()
## .
## FALSE  TRUE 
## 12525 19532
# Create logical vector of genes to keep that fit criteria
genes2keep <- dgeList %>%
  cpm() %>%
  is_greater_than(1) %>%
  rowSums() %>%
  is_weakly_greater_than(3)
# Create new DGEList of genes fitting criteria
dgeFilt <- dgeList[genes2keep,, keep.lib.sizes = FALSE] %>%
  calcNormFactors()
# Compare distributions of the DGELists before and after filtering
par(mfrow = c(1,2))
dgeList %>%
  cpm(log = TRUE) %>%
  plotDensities(legend = FALSE, main = "Before Filtering")
dgeFilt %>%
  cpm(log = TRUE) %>%
  plotDensities(legend = FALSE, main = "After Filtering")

par(mfrow = c(1,1))

Library size check

# Check library sizes with box plot
dgeFilt$samples %>%
  ggplot(aes(group, lib.size, fill = group)) +
  geom_boxplot() +
  scale_y_continuous(labels = comma) +
  labs(x = "Family", y = "Library Size") +
  theme(legend.position = "none")

PCA

Setup

# Assess cpm values to make sure PCA results are not heavily skewed by highly expressed genes
pca <- dgeFilt %>%
  cpm(log = TRUE) %>%
  t() %>%
  prcomp()
# Quick inspection to check whether first two PCA components capture most of the variability
summary(pca)$importance %>% pander()
  PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10 PC11 PC12 PC13 PC14 PC15 PC16 PC17 PC18 PC19 PC20 PC21 PC22 PC23 PC24 PC25 PC26 PC27 PC28 PC29 PC30
Standard deviation 61.42 46.8 34.09 22.69 21.84 19.87 16.37 14.65 13.58 13.27 12.15 11.34 10.68 10.14 10.05 9.823 9.487 9.19 8.942 8.821 8.754 8.56 8.467 8.128 7.975 7.883 7.597 7.263 6.84 8.673e-14
Proportion of Variance 0.3446 0.2001 0.1061 0.04702 0.04357 0.03606 0.02448 0.0196 0.01685 0.01609 0.01348 0.01175 0.01042 0.00939 0.00923 0.00881 0.00822 0.00771 0.0073 0.00711 0.007 0.00669 0.00655 0.00603 0.00581 0.00568 0.00527 0.00482 0.00427 0
Cumulative Proportion 0.3446 0.5446 0.6508 0.6978 0.7414 0.7774 0.8019 0.8215 0.8384 0.8545 0.8679 0.8797 0.8901 0.8995 0.9087 0.9175 0.9258 0.9335 0.9408 0.9479 0.9549 0.9616 0.9681 0.9741 0.98 0.9856 0.9909 0.9957 1 1
# Create function for plotting PCA
ggPCA <- function(x, y){
  pca$x %>%
    as.data.frame() %>%
    rownames_to_column("sample") %>%
    as_tibble() %>%
    dplyr::select(sample, one_of(c(x, y))) %>%
    left_join(dgeFilt$samples) %>%
    left_join(metaData) %>%
    ggplot(
      aes_string(
        x, y, colour = "group", shape = "provider", label = "sex"
      )
    ) +
    geom_point(size = 3, stroke = 0.5) +
    # geom_text_repel(show.legend = FALSE) +
    labs(colour = "Family", shape = "Provider") +
    xlab(label = paste0(x, " (", percent(summary(pca)$importance[2, x]), ")")) +
    ylab(label = paste0(y, " (", percent(summary(pca)$importance[2, y]), ")")) +
    theme_bw()
}

Plots

ggPCA("PC1", "PC2")

# Set up grid for visualisation of 3 PCA's at once.
vp1 <- viewport(x = 0.25, y = 0.75, width = 0.5, height = 0.5)
vp2 <- viewport(x = 0.75, y = 0.75, width = 0.5, height = 0.5)
vp3 <- viewport(x = 0.25, y = 0.25, width = 0.5, height = 0.5)
# Plot PCA plots in grid
if (interactive()) grid::grid.newpage()
print(
  ggPCA("PC2", "PC3") +
    theme(legend.position = "none"),
  vp = vp1
)
print(
  ggPCA("PC3", "PC4") +
    theme(legend.position = "none"),
  vp = vp2
)
print(
  ggPCA("PC4", "PC5") +
    guides(
      colour = guide_legend(ncol = 2),
      shape = guide_legend(ncol = 3)
    ) +
    theme(legend.position = c(1.6, 0.5)),
  vp = vp3
)

Differential Expression

Gene labeller

# Set up gene labeller for easy conversion
geneLabeller <- as_labeller(
  structure(genesGR$gene_name, names = genesGR$gene_id)
)

Voom

voomData <- voom(dgeFilt, design = matrix(1, nrow = ncol(dgeFilt)))
pcaDesign <- cbind(Intercept = 1, pca$x[,1:6])
pcaFit <- lmFit(voomData, pcaDesign) %>%
  eBayes
# topTable of PC1
topTablePC1 <- topTable(pcaFit, coef = "PC1", n = Inf) %>%
  as_tibble() %>% 
  set_colnames(str_remove_all(colnames(.), "ID.")) %>%
  unite("Range", start, end, sep = "-") %>%
  unite("Location", seqnames, Range, strand, sep = ":") %>%
  dplyr::select(
    gene_id, gene_name, AveExpr, logFC, 
    P.Value, adj.P.Val, t, Location, entrezid
  )

Just grab one or two to test:

voomData$E[rownames(topTable(pcaFit, coef = "PC1", n = Inf))[1:10],] %>%
  t() %>%
  as.data.frame(stringsAsFactors = FALSE) %>%
  rownames_to_column("sample") %>%
  as_tibble() %>%
  gather("gene_id", "logCPM", -sample) %>%
  left_join(voomData$targets) %>%
  left_join(metaData, by = "sample") %>%
  cbind(pca$x[.$sample,]) %>%
  ggplot(aes(PC1, logCPM, colour = group, shape = provider, label = sex)) +
  geom_point() +
  facet_wrap(~gene_id, labeller = geneLabeller)

Gene length and GC content

Setup

# # This is extremely slow, takes approximately 30 minutes.
# # Saved as .rds file to save time
# id <- rownames(dgeFilt)
# gcContent <- getGeneLengthAndGCContent(id, org = "dre", mode = "biomart")
# saveRDS(gcContent, "../files/gcContent.rds")
geneGCLen <- readRDS("../files/gcContent.rds") %>%
  as.data.frame() %>%
  rownames_to_column("gene_id") %>%
  as_tibble()
# Get transcript lengths and choose longest transcript to be gene length
transLen <- transcriptLengths(ensDb) %>% as_tibble()
geneLen <- transLen %>%
  arrange(gene_id, desc(tx_len)) %>%
  distinct(gene_id, .keep_all = TRUE)

Gene Length

Initial inspection

topTablePC1 %>% 
  mutate(rank = seq_len(nrow(.))) %>% 
  left_join(geneGCLen) %>% 
  ggplot(aes(rank, length)) + 
  geom_point() + 
  scale_y_log10() +
  geom_smooth(method = "lm", se = FALSE)

topTablePC1 %>%
  mutate(rank = seq_len(nrow(.))) %>% 
  left_join(geneGCLen) %>%
  ggplot(aes(t, length)) +
  geom_point() +
  scale_y_log10() +
  geom_smooth(method = "lm", se = FALSE)

Length comparison

# Grab DE down genes and join lengths
# Outliers were also removed to make boxplots look nicer (only 7 genes)
deDown <- topTablePC1 %>% 
  dplyr::filter(adj.P.Val < 0.05, logFC < 0) %>%
  dplyr::select(gene_id, gene_name) %>%
  dplyr::mutate(group = "down")
# Grab DE up genes and join lengths
deUp <- topTablePC1 %>% 
  dplyr::filter(adj.P.Val < 0.05, logFC > 0) %>%
  dplyr::select(gene_id, gene_name) %>%
  dplyr::mutate(group = "up")
# Grab non DE genes and join lengths
deNon <- topTablePC1 %>% 
  dplyr::filter(adj.P.Val > 0.05) %>%
  dplyr::select(gene_id, gene_name) %>%
  dplyr::mutate(group = "non")
# Join into one tibble and box plot to compare
deDown %>%
  full_join(deUp) %>%
  full_join(deNon) %>%
  left_join(geneGCLen) %>%
  ggplot(aes(group, length, fill = group)) +
  geom_boxplot(outlier.shape = NA) +
  scale_y_continuous(labels = comma) +
  labs(x = "DE", y = "Gene Length") +
  coord_cartesian(ylim = c(0, 10000)) +
  theme(legend.position = "none") 

LogFC vs length

topTablePC1 %>%
  left_join(geneGCLen) %>%
  mutate(logFC = logFC^(1/3)) %>%
  ggplot(aes(length, logFC, colour = gc)) +
  geom_point(alpha = 0.5) +
  scale_color_gradient2(low = "black", mid = "red", high = "yellow") +
  scale_x_log10(labels = comma) +
  xlab("Length (bp)") +
  ylab("LogFC")

GC content

GC content comparison

deDown %>%
  full_join(deUp) %>%
  full_join(deNon) %>% 
  left_join(geneGCLen) %>%
  ggplot(aes(group, gc, fill = group)) +
  geom_boxplot() +
  labs(x = "DE", y = "GC Content") +
  theme(legend.position = "none")

LogFC vs GC content

topTablePC1 %>%
  left_join(geneGCLen) %>%
  ggplot(aes(gc, logFC)) +
  geom_point() +
  scale_x_continuous(label = scales::percent_format(accuracy = 1)) +
  xlab("GC content") +
  ylab("LogFC")

Gene level variance

# Create function for isolating columns per family and calculating variance
famVar <- function(x){
  dgeFilt %>%
    cpm() %>%
    as.data.frame() %>%
    dplyr::select(str_subset(colnames(.), paste0(x, "_Wt\\d"))) %>%
    apply(1, var) %>%
    as_tibble(rownames = "geneid") %>%
    dplyr::mutate(group = x)
}
# Calculate variance for each family
var1 <- famVar("2016Q96K97")
var2 <- famVar("2016K97Gfs") 
var3 <- famVar("2016PSEN2")
var4 <- famVar("2017Q96K97")
var5 <- famVar("2018PSEN2")
var6 <- famVar("2019Q96K97")
var7 <- famVar("2019SORL1")
# Plot boxplot
var1 %>% 
  full_join(var2) %>%
  full_join(var3) %>%
  full_join(var4) %>%
  full_join(var5) %>%
  full_join(var6) %>%
  full_join(var7) %>%
  ggplot(aes(group, value, fill = group)) +
  geom_boxplot(outlier.shape = NA) +
  labs(x = "Family", y = "Gene level variance", fill = "Provider") +
  scale_fill_discrete(
    labels = c(
      "ACRF", "ACRF", "ACRF", "SAHMRI", "SAHMRI", "Novogene", "Novogene"
    )
  ) +
  coord_cartesian(ylim = c(0, 75))

GO analysis

ID conversion

# # Some packages are required to be unloaded before biomaRt can be loaded
# detach("package:EDASeq", unload = TRUE)
# detach("package:ensembldb", unload = TRUE)
# detach("package:GenomicFeatures", unload = TRUE)
# library(biomaRt)
# # Human entrez and ensembl IDs
# humanEntrezEns <- org.Hs.egENSEMBL %>%
#   as.data.frame %>%
#   set_colnames(c("Hs_entrez", "Hs_ensembl"))
# # Zebrafish ensembl IDs
# zebEns <- org.Dr.egENSEMBL %>%
#   as.data.frame %>%
#   set_colnames(c("Dr_entrez", "Dr_ensembl"))
# # Create a data.frame to map between human entrezgenes and zebrafish ensembl IDs.
# # BioMart only includes homolog mappings for ensembl IDs which is why we need to 
# # retrieve human ensembl IDs, then join to the humanEntrezEns data.frame,
# # in order to get the desired human entrezgenes to zebrafish ensembl ID mapping. 
# zebMart <- useMart("ENSEMBL_MART_ENSEMBL", "drerio_gene_ensembl")
# getFromBiomart <- c("ensembl_gene_id", "hsapiens_homolog_ensembl_gene")
# ens2entrez <- getBM(
#   getFromBiomart,
#   values = unique(zebEns$Dr_ensembl),
#   mart = zebMart
# ) %>%
#   set_colnames(c("Dr_ensembl", "Hs_ensembl")) %>%
#   left_join(humanEntrezEns, by = "Hs_ensembl") %>%
#   dplyr::select(-Hs_ensembl) %>% 
#   dplyr::filter(complete.cases(.)) %>%
#   as_tibble()

GO enrichment

# de <- topTablePC1 %>%
#   dplyr::filter(adj.P.Val < 0.05) %>%
#   dplyr::select(gene_id) %>%
#   left_join(ens2entrez, by = c("gene_id" = "Dr_ensembl")) %>%
#   dplyr::filter(!is.na(Hs_entrez)) %>%
#   .[["Hs_entrez"]] %>%
#   unique()
# uv <- topTablePC1 %>%
#   dplyr::select(gene_id) %>%
#   left_join(ens2entrez, by = c("gene_id" = "Dr_ensembl")) %>%
#   dplyr::filter(!is.na(Hs_entrez)) %>%
#   .[["Hs_entrez"]] %>%
#   unique()
# goResults <- goana(de = de, universe = uv, species = "Hs")
# goResults %>% 
#   rownames_to_column("GO ID") %>%
#   as_tibble() %>%
#   dplyr::filter(DE > 1) %>%
#   dplyr::arrange(P.DE) %>%
#   mutate(FDR = p.adjust(P.DE, "fdr")) %>%
#   dplyr::filter(FDR < 0.05) %>%
#   mutate(`GO ID` = str_replace(`GO ID`, ":", "\\\\:")) %>%
#   pander(justify = "llrrrrr")

Steve’s wizardry

Initial inspection

geneGCLen %>%
  cbind(pca$rotation[.$gene_id,]) %>% 
  as_tibble() %>%
  ggplot(aes(length, PC1, colour = gc)) +
  geom_point() +
  scale_x_log10(labels = comma) +
  scale_colour_viridis_c(option = "inferno")

Bin setup

# Create table breaking GC content and gene length into bins with approximately 
# equal amounts of genes in each bin, along with their contribution to each 
# principal component axis.
nBins <- list(length = 20, gc = 20)
binTable <- geneGCLen %>%
  cbind(pca$rotation[.$gene_id,]) %>% 
  as_tibble() %>%
  mutate(
    lengthBins = cut(
      log(length), 
      # breaks = nBins$length,
      breaks = quantile(
        log(length), seq(0, nBins$length)/nBins$length
      ),
      labels = paste0("L", seq_len(nBins$length)), 
      include.lowest = TRUE
    ),
    gcBins = cut(
      gc, 
      # breaks = nBins$gc,
      breaks = quantile(
        gc, seq(0, nBins$gc) / nBins$gc
      ),
      labels = paste0("GC", seq_len(nBins$gc)), 
      include.lowest = TRUE
    ),
    bothBins = paste(lengthBins, gcBins, sep = "_"),
    bothBins = as.factor(bothBins)
  ) %>%
  group_by(bothBins) %>%
  mutate(n = n()) %>%
  ungroup() %>%
  dplyr::filter(n > 1) %>%
  as_tibble()
# Setup functions to get ranges of bins.
minLen <- function(x){
  binTable %>% 
    dplyr::filter(lengthBins == x) %>%
    dplyr::select(length) %>% 
    min()
}
maxLen <- function(x){
  binTable %>% 
    dplyr::filter(lengthBins == x) %>%
    dplyr::select(length) %>% 
    max()
}
minGC <- function(x){
  binTable %>% 
    dplyr::filter(gcBins == x) %>%
    dplyr::select(gc) %>% 
    min()
}
maxGC <- function(x){
  binTable %>% 
    dplyr::filter(gcBins == x) %>%
    dplyr::select(gc) %>% 
    max()
}
# Put ranges of bins in table.
binRanges <- tibble(
  Bin = c(1:20),
  minLen = c(minLen("L1"), minLen("L2"), minLen("L3"), minLen("L4"), 
             minLen("L5"), minLen("L6"), minLen("L7"), minLen("L8"),
             minLen("L9"), minLen("L10"), minLen("L11"), minLen("L12"), 
             minLen("L13"), minLen("L14"), minLen("L15"), minLen("L16"), 
             minLen("L17"), minLen("L18"), minLen("L19"), minLen("L20")),
  maxLen = c(maxLen("L1"), maxLen("L2"), maxLen("L3"), maxLen("L4"), 
             maxLen("L5"), maxLen("L6"), maxLen("L7"), maxLen("L8"),
             maxLen("L9"), maxLen("L10"), maxLen("L11"), maxLen("L12"), 
             maxLen("L13"), maxLen("L14"), maxLen("L15"), maxLen("L16"), 
             maxLen("L17"), maxLen("L18"), maxLen("L19"), maxLen("L20")),
  minGC = c(minGC("GC1"), minGC("GC2"), minGC("GC3"), minGC("GC4"), 
            minGC("GC5"), minGC("GC6"), minGC("GC7"), minGC("GC8"),
            minGC("GC9"), minGC("GC10"), minGC("GC11"), minGC("GC12"), 
            minGC("GC13"), minGC("GC14"), minGC("GC15"), minGC("GC16"), 
            minGC("GC17"), minGC("GC18"), minGC("GC19"), minGC("GC20")),
  maxGC = c(maxGC("GC1"), maxGC("GC2"), maxGC("GC3"), maxGC("GC4"), 
            maxGC("GC5"), maxGC("GC6"), maxGC("GC7"), maxGC("GC8"),
            maxGC("GC9"), maxGC("GC10"), maxGC("GC11"), maxGC("GC12"), 
            maxGC("GC13"), maxGC("GC14"), maxGC("GC15"), maxGC("GC16"), 
            maxGC("GC17"), maxGC("GC18"), maxGC("GC19"), maxGC("GC20"))
  ) %>%
  mutate(
    minGC = round(100*minGC, digits = 0),
    maxGC = round(100*maxGC, digits = 0),
    minLen = round(minLen / 1000, digits = 1),
    maxLen = round(maxLen / 1000, digits = 1)
  ) %>%
  unite("Length", minLen, maxLen, sep = "-") %>%
  unite("GC_Content", minGC, maxGC, sep = "-")
binRanges
## # A tibble: 20 x 3
##      Bin Length   GC_Content
##    <int> <chr>    <chr>     
##  1     1 0.1-0.8  22-33     
##  2     2 0.8-1    33-34     
##  3     3 1-1.2    34-35     
##  4     4 1.2-1.4  35-36     
##  5     5 1.4-1.6  36-37     
##  6     6 1.6-1.8  37-37     
##  7     7 1.8-1.9  37-38     
##  8     8 1.9-2.1  38-39     
##  9     9 2.1-2.3  39-40     
## 10    10 2.3-2.5  40-41     
## 11    11 2.5-2.7  41-42     
## 12    12 2.7-3    42-42     
## 13    13 3-3.3    42-43     
## 14    14 3.3-3.6  43-44     
## 15    15 3.6-4.1  44-46     
## 16    16 4.1-4.6  46-47     
## 17    17 4.6-5.3  47-48     
## 18    18 5.3-6.1  48-50     
## 19    19 6.1-7.5  50-52     
## 20    20 7.6-98.6 52-67

PC1

# Fit linear model to explain PC1 by GC content and gene length.
lm_PC1 <- binTable %>%  
  lm(PC1 ~ 0 + bothBins, data = .)
# FDR adjust for multiple testing.
# Add significance column for filtering insignificant bins.
binFDR_PC1 <- summary(lm_PC1)$coef %>%
  as.data.frame() %>% 
  rownames_to_column("Bin") %>% 
  as_tibble() %>%
  mutate(FDR = p.adjust(`Pr(>|t|)`, "fdr")) %>%
  mutate(Bin = str_remove(Bin, "bothBins")) %>% 
  separate(Bin, into = c("length", "GC")) %>%
  dplyr::mutate(
    sig = FDR < 0.05,
    p = `Pr(>|t|)`
  ) %>%
  dplyr::select(length, GC, sig, FDR, p)
# Plot gene length bins vs GC contents bins showing contribution to PC1
# and significance.
coef(lm_PC1) %>% 
  enframe() %>%
  mutate(name = str_remove(name, "bothBins")) %>% 
  separate(name, into = c("length", "GC")) %>%
  left_join(binFDR_PC1) %>%
  dplyr::filter(sig) %>%
  mutate(
    length = str_remove(length, "L") %>% as.integer(),
    GC = str_remove(GC, "GC") %>% as.integer()
  ) %>%
  ggplot(aes(length, GC)) +
  geom_point(aes(colour = value, size = -log10(p))) +
  scale_x_continuous(
    breaks = seq_len(nBins$length),
    labels = binRanges$Length
  ) +
  scale_y_continuous(
    breaks = seq_len(nBins$gc),
    labels = binRanges$GC_Content
  ) +
  scale_size_continuous(guide = FALSE) +
  labs(
    x = "Gene Length (kb)",
    y = "GC Content (%)",
    colour = paste("Contribution", "to PC1", sep = "\n"),
    size = "Significance"
  ) +
  theme(
    panel.grid.minor = element_blank(),
    axis.text.x = element_text(
      angle = -45, hjust = 0.1, vjust = 0.5
    )
  )

PC2

# Fit linear model to explain PC2 by GC content and gene length.
lm_PC2 <- binTable %>%  
  lm(PC2 ~ 0 + bothBins, data = .)
# FDR adjust for multiple testing.
# Add significance column for filtering insignificant bins.
binFDR_PC2 <- summary(lm_PC2)$coef %>%
  as.data.frame() %>% 
  rownames_to_column("Bin") %>% 
  as_tibble() %>%
  mutate(FDR = p.adjust(`Pr(>|t|)`, "fdr")) %>%
  mutate(Bin = str_remove(Bin, "bothBins")) %>% 
  separate(Bin, into = c("length", "GC")) %>%
  dplyr::mutate(
    sig = FDR < 0.05,
    p = `Pr(>|t|)`
  ) %>%
  dplyr::select(length, GC, sig, FDR, p)
# Plot gene length bins vs GC contents bins showing contribution to PC2
# and significance.
coef(lm_PC2) %>% 
  enframe() %>%
  mutate(name = str_remove(name, "bothBins")) %>% 
  separate(name, into = c("length", "GC")) %>%
  left_join(binFDR_PC2) %>%
  dplyr::filter(sig) %>%
  mutate(
    length = str_remove(length, "L") %>% as.integer(),
    GC = str_remove(GC, "GC") %>% as.integer()
  ) %>%
  ggplot(aes(length, GC)) +
  geom_point(aes(colour = value, size = -log10(p))) +
  scale_x_continuous(
    breaks = seq_len(nBins$length),
    labels = binRanges$Length
  ) +
  scale_y_continuous(
    breaks = seq_len(nBins$gc),
    labels = binRanges$GC_Content
  ) +
  scale_size_continuous(guide = FALSE) +
  labs(
    x = "Gene Length (kb)",
    y = "GC Content (%)",
    colour = paste("Contribution", "to PC2", sep = "\n"),
    size = "Significance"
  ) +
  theme(
    panel.grid.minor = element_blank(),
    axis.text.x = element_text(
      angle = -45, hjust = 0.1, vjust = 0.5
    )
  )

Old POA

# myLmPlot <- function(x){
#   par(mfrow = c(2,2))
#   plot(x)
#   par(mfrow = c(1,1))
# }

We ran an initial model plotting gene-level contributions (i.e. rotations) to PC1 against the two potential predictor variables: log10(length) and gc content.

# geneGCLen %>%
#   cbind(pca$rotation[.$gene_id,]) %>% 
#   as_tibble() %>%
#   lm(PC1 ~ log10(length)*gc, data = .) %T>%
#   myLmPlot() %>%
#   summary() %>%
#   pander()

This model clearly violated the assumption of constant variance meaning the basic assumptions for the linear model were not able to be applied. As such, the Box-Cox transformation was applied using an offset of \(\gamma = 1\) and with \(\lambda = 38\), as determined by fitting the profile likelihood for \(\lambda\).

# geneGCLen %>%
#   cbind(pca$rotation[.$gene_id,]) %>% 
#   as_tibble() %>%
#   with(
#     boxCox((1 + PC1) ~ log(length)*gc,
#            lambda = seq(0, 50, length = 50)
#            )
#     )

The transformed data was then fit and the assumption of constant variance was able to be applied.

# geneGCLen %>%
#   cbind(pca$rotation[.$gene_id,]) %>% 
#   as_tibble() %>%
#   mutate(
#     bcPC1 = bcnPower(PC1, lambda = 38, gamma = 1)
#   ) %>%
#   lm(bcPC1 ~ log10(length)*gc, data = .) %T>%
#   myLmPlot() %>%
#   summary() %>%
#   pander()

Whilst not being simply interpretable, this suggests that both GC content and gene length are contributing to the variability detected along PC1.

# geneGCLen %>%
#   cbind(pca$rotation[.$gene_id,]) %>% 
#   as_tibble() %>%
#   ggplot(aes(length, PC1, colour = gc)) +
#   geom_point() +
#   scale_x_log10(labels = comma) +
#   scale_colour_viridis_c(option = "inferno")

RUVSeq

Negative controls

# Extract subset of low ranked genes from DE analysis to act as negative 
# controls for RUVg procedure
negControl <- topTablePC1 %>% 
  dplyr::arrange(desc(P.Value)) %>%
  .[1:10000,] %>%
  .$gene_id

k = 1

# Run RUVSeq for k = 1
RUVg_k1 <- RUVg(dgeFilt$counts, negControl, 1)

PCA

# Create copy of dgeFilt as framework to replace RUVSeq results into
dgeFalse_k1 <- dgeFilt
# Replace with results
dgeFalse_k1$counts <- RUVg_k1$normalizedCounts
# Run PCA function
pcaRUVg_k1 <- dgeFalse_k1 %>%
  cpm(log = TRUE) %>%
  t() %>%
  prcomp()
# Quick inspection to check whether first two PCA components capture most of the variability
summary(pcaRUVg_k1)$importance %>% pander()
  PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10 PC11 PC12 PC13 PC14 PC15 PC16 PC17 PC18 PC19 PC20 PC21 PC22 PC23 PC24 PC25 PC26 PC27 PC28 PC29 PC30
Standard deviation 72.83 58.38 34.92 31.2 22.51 20.29 18.02 16.1 14.5 13.32 12.75 11.73 11.32 10.51 10.07 9.816 9.607 9.404 9.043 8.955 8.695 8.467 8.41 8.167 7.929 7.774 7.628 7.268 6.667 8.439e-14
Proportion of Variance 0.3673 0.2361 0.08448 0.06742 0.03509 0.02852 0.02248 0.01794 0.01455 0.0123 0.01127 0.00953 0.00887 0.00765 0.00702 0.00667 0.00639 0.00613 0.00566 0.00555 0.00524 0.00497 0.0049 0.00462 0.00435 0.00419 0.00403 0.00366 0.00308 0
Cumulative Proportion 0.3673 0.6034 0.6879 0.7553 0.7904 0.8189 0.8414 0.8594 0.8739 0.8862 0.8975 0.907 0.9159 0.9235 0.9306 0.9372 0.9436 0.9497 0.9554 0.961 0.9662 0.9712 0.9761 0.9807 0.985 0.9892 0.9933 0.9969 1 1
# Plot PCA
pcaRUVg_k1$x %>%
    as.data.frame() %>%
    rownames_to_column("sample") %>%
    as_tibble() %>%
    dplyr::select(sample, one_of(c("PC1", "PC2"))) %>%
    left_join(dgeFalse_k1$samples) %>%
    left_join(metaData) %>%
    ggplot(
      aes_string(
        "PC1", "PC2", colour = "group", shape = "provider", label = "sex"
      )
    ) +
    geom_point(size = 3, stroke = 0.5) +
    # geom_text_repel(show.legend = FALSE) +
    labs(colour = "Family", shape = "Provider") +
    xlab(label = paste0("PC1", " (", percent(summary(pcaRUVg_k1)$importance[2, "PC1"]), ")")) +
    ylab(label = paste0("PC2", " (", percent(summary(pcaRUVg_k1)$importance[2, "PC2"]), ")")) +
    theme_bw()

k = 2

# Run RUVSeq for k = 2
RUVg_k2 <- RUVg(dgeFilt$counts, negControl, 2)

PCA

# Create copy of dgeFilt as framework to replace RUVSeq results into
dgeFalse_k2 <- dgeFilt
# Replace with results
dgeFalse_k2$counts <- RUVg_k2$normalizedCounts
# Run PCA function
pcaRUVg_k2 <- dgeFalse_k2 %>%
  cpm(log = TRUE) %>%
  t() %>%
  prcomp()
# Quick inspection to check whether first two PCA components capture most of the variability
summary(pcaRUVg_k2)$importance %>% pander()
  PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10 PC11 PC12 PC13 PC14 PC15 PC16 PC17 PC18 PC19 PC20 PC21 PC22 PC23 PC24 PC25 PC26 PC27 PC28 PC29 PC30
Standard deviation 72.96 57.92 31.88 22.48 20.21 18.08 16.15 14.76 13.4 12.79 11.75 11.36 10.47 10.06 9.808 9.626 9.405 8.965 8.864 8.686 8.445 8.383 8.169 7.961 7.789 7.664 7.291 6.721 2.433 1.006e-13
Proportion of Variance 0.402 0.2533 0.07674 0.03818 0.03084 0.02469 0.01971 0.01645 0.01356 0.01235 0.01042 0.00975 0.00828 0.00765 0.00727 0.007 0.00668 0.00607 0.00593 0.0057 0.00539 0.00531 0.00504 0.00479 0.00458 0.00444 0.00402 0.00341 0.00045 0
Cumulative Proportion 0.402 0.6553 0.7321 0.7702 0.8011 0.8258 0.8455 0.8619 0.8755 0.8879 0.8983 0.908 0.9163 0.9239 0.9312 0.9382 0.9449 0.951 0.9569 0.9626 0.968 0.9733 0.9783 0.9831 0.9877 0.9921 0.9961 0.9996 1 1
# Plot PCA
pcaRUVg_k2$x %>%
    as.data.frame() %>%
    rownames_to_column("sample") %>%
    as_tibble() %>%
    dplyr::select(sample, one_of(c("PC1", "PC2"))) %>%
    left_join(dgeFalse_k2$samples) %>%
    left_join(metaData) %>%
    ggplot(
      aes_string(
        "PC1", "PC2", colour = "group", shape = "provider", label = "sex"
      )
    ) +
    geom_point(size = 3, stroke = 0.5) +
    # geom_text_repel(show.legend = FALSE) +
    labs(colour = "Family", shape = "Provider") +
    xlab(label = paste0("PC1", " (", percent(summary(pcaRUVg_k2)$importance[2, "PC1"]), ")")) +
    ylab(label = paste0("PC2", " (", percent(summary(pcaRUVg_k2)$importance[2, "PC2"]), ")")) +
    theme_bw()

k = 3

# Run RUVSeq for k = 3
RUVg_k3 <- RUVg(dgeFilt$counts, negControl, 3)

PCA

# Create copy of dgeFilt as framework to replace RUVSeq results into
dgeFalse_k3 <- dgeFilt
# Replace with results
dgeFalse_k3$counts <- RUVg_k3$normalizedCounts
# Run PCA function
pcaRUVg_k3 <- dgeFalse_k3 %>%
  cpm(log = TRUE) %>%
  t() %>%
  prcomp()
# Quick inspection to check whether first two PCA components capture most of the variability
summary(pcaRUVg_k3)$importance %>% pander()
  PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10 PC11 PC12 PC13 PC14 PC15 PC16 PC17 PC18 PC19 PC20 PC21 PC22 PC23 PC24 PC25 PC26 PC27 PC28 PC29 PC30
Standard deviation 73.54 42.11 23.2 20.23 18.56 16.19 15.06 13.27 12.76 11.69 11.41 10.47 10.13 9.877 9.715 9.419 9.028 8.88 8.775 8.567 8.463 8.257 7.995 7.759 7.589 7.437 6.816 1.949 1.884 1.665e-13
Proportion of Variance 0.5007 0.1641 0.04982 0.03788 0.03188 0.02428 0.02099 0.01629 0.01507 0.01266 0.01205 0.01015 0.0095 0.00903 0.00874 0.00821 0.00755 0.0073 0.00713 0.00679 0.00663 0.00631 0.00592 0.00557 0.00533 0.00512 0.0043 0.00035 0.00033 0
Cumulative Proportion 0.5007 0.6648 0.7146 0.7525 0.7844 0.8087 0.8297 0.846 0.861 0.8737 0.8858 0.8959 0.9054 0.9144 0.9232 0.9314 0.9389 0.9462 0.9533 0.9601 0.9668 0.9731 0.979 0.9846 0.9899 0.995 0.9993 0.9997 1 1
# Plot PCA
pcaRUVg_k3$x %>%
    as.data.frame() %>%
    rownames_to_column("sample") %>%
    as_tibble() %>%
    dplyr::select(sample, one_of(c("PC1", "PC2"))) %>%
    left_join(dgeFalse_k3$samples) %>%
    left_join(metaData) %>%
    ggplot(
      aes_string(
        "PC1", "PC2", colour = "group", shape = "provider", label = "sex"
      )
    ) +
    geom_point(size = 3, stroke = 0.5) +
    # geom_text_repel(show.legend = FALSE) +
    labs(colour = "Family", shape = "Provider") +
    xlab(label = paste0("PC1", " (", percent(summary(pcaRUVg_k3)$importance[2, "PC1"]), ")")) +
    ylab(label = paste0("PC2", " (", percent(summary(pcaRUVg_k3)$importance[2, "PC2"]), ")")) +
    theme_bw()

k = 4

# Run RUVSeq for k = 4
RUVg_k4 <- RUVg(dgeFilt$counts, negControl, 4)

PCA

# Create copy of dgeFilt as framework to replace RUVSeq results into
dgeFalse_k4 <- dgeFilt
# Replace with results
dgeFalse_k4$counts <- RUVg_k4$normalizedCounts
# Run PCA function
pcaRUVg_k4 <- dgeFalse_k4 %>%
  cpm(log = TRUE) %>%
  t() %>%
  prcomp()
# Quick inspection to check whether first two PCA components capture most of the variability
summary(pcaRUVg_k4)$importance %>% pander()
  PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10 PC11 PC12 PC13 PC14 PC15 PC16 PC17 PC18 PC19 PC20 PC21 PC22 PC23 PC24 PC25 PC26 PC27 PC28 PC29 PC30
Standard deviation 73.48 34.98 21.05 20.15 16.16 14.96 13.26 12.76 11.71 11.44 10.47 10.14 9.883 9.714 9.391 9.064 8.893 8.756 8.57 8.499 8.251 8.01 7.79 7.61 7.405 6.78 1.827 1.642 1.551 7.317e-14
Proportion of Variance 0.5509 0.1249 0.04523 0.04145 0.02665 0.02285 0.01795 0.01661 0.01399 0.01336 0.01118 0.01049 0.00997 0.00963 0.009 0.00838 0.00807 0.00782 0.00749 0.00737 0.00695 0.00655 0.00619 0.00591 0.00559 0.00469 0.00034 0.00028 0.00025 0
Cumulative Proportion 0.5509 0.6758 0.721 0.7624 0.7891 0.8119 0.8299 0.8465 0.8605 0.8739 0.885 0.8955 0.9055 0.9151 0.9241 0.9325 0.9406 0.9484 0.9559 0.9633 0.9702 0.9768 0.9829 0.9889 0.9944 0.9991 0.9995 0.9998 1 1
# Plot PCA
pcaRUVg_k4$x %>%
    as.data.frame() %>%
    rownames_to_column("sample") %>%
    as_tibble() %>%
    dplyr::select(sample, one_of(c("PC1", "PC2"))) %>%
    left_join(dgeFalse_k4$samples) %>%
    left_join(metaData) %>%
    ggplot(
      aes_string(
        "PC1", "PC2", colour = "group", shape = "provider", label = "sex"
      )
    ) +
    geom_point(size = 3, stroke = 0.5) +
    # geom_text_repel(show.legend = FALSE) +
    labs(colour = "Family", shape = "Provider") +
    xlab(label = paste0("PC1", " (", percent(summary(pcaRUVg_k4)$importance[2, "PC1"]), ")")) +
    ylab(label = paste0("PC2", " (", percent(summary(pcaRUVg_k4)$importance[2, "PC2"]), ")")) +
    theme_bw()

k = 5

# Run RUVSeq for k = 5
RUVg_k5 <- RUVg(dgeFilt$counts, negControl, 5)

PCA

# Create copy of dgeFilt as framework to replace RUVSeq results into
dgeFalse_k5 <- dgeFilt
# Replace with results
dgeFalse_k5$counts <- RUVg_k5$normalizedCounts
# Run PCA function
pcaRUVg_k5 <- dgeFalse_k5 %>%
  cpm(log = TRUE) %>%
  t() %>%
  prcomp()
# Quick inspection to check whether first two PCA components capture most of the variability
summary(pcaRUVg_k5)$importance %>% pander()
  PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10 PC11 PC12 PC13 PC14 PC15 PC16 PC17 PC18 PC19 PC20 PC21 PC22 PC23 PC24 PC25 PC26 PC27 PC28 PC29 PC30
Standard deviation 73.52 27.87 20.33 16.21 14.99 13.27 12.77 11.76 11.5 10.52 10.17 9.889 9.752 9.412 9.033 8.886 8.803 8.601 8.509 8.253 7.996 7.804 7.61 7.398 6.758 1.787 1.616 1.197 1.096 8.744e-14
Proportion of Variance 0.6052 0.08694 0.04629 0.02943 0.02515 0.0197 0.01826 0.01547 0.0148 0.01238 0.01157 0.01095 0.01065 0.00992 0.00914 0.00884 0.00868 0.00828 0.00811 0.00763 0.00716 0.00682 0.00648 0.00613 0.00511 0.00036 0.00029 0.00016 0.00013 0
Cumulative Proportion 0.6052 0.6921 0.7384 0.7678 0.793 0.8127 0.8309 0.8464 0.8612 0.8736 0.8852 0.8961 0.9068 0.9167 0.9258 0.9347 0.9433 0.9516 0.9597 0.9674 0.9745 0.9813 0.9878 0.9939 0.9991 0.9994 0.9997 0.9999 1 1
# Plot PCA
pcaRUVg_k5$x %>%
    as.data.frame() %>%
    rownames_to_column("sample") %>%
    as_tibble() %>%
    dplyr::select(sample, one_of(c("PC1", "PC2"))) %>%
    left_join(dgeFalse_k5$samples) %>%
    left_join(metaData) %>%
    ggplot(
      aes_string(
        "PC1", "PC2", colour = "group", shape = "provider", label = "sex"
      )
    ) +
    geom_point(size = 3, stroke = 0.5) +
    # geom_text_repel(show.legend = FALSE) +
    labs(colour = "Family", shape = "Provider") +
    xlab(label = paste0("PC1", " (", percent(summary(pcaRUVg_k5)$importance[2, "PC1"]), ")")) +
    ylab(label = paste0("PC2", " (", percent(summary(pcaRUVg_k5)$importance[2, "PC2"]), ")")) +
    theme_bw()

Animation

anim0 <- pca$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  as_tibble() %>%
  dplyr::select(sample, one_of(c("PC1", "PC2"))) %>%
  left_join(dgeFilt$samples) %>%
  left_join(metaData) %>%
  mutate(k = "before RUVSeq")
anim1 <- pcaRUVg_k1$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  as_tibble() %>%
  dplyr::select(sample, one_of(c("PC1", "PC2"))) %>%
  left_join(dgeFalse_k1$samples) %>%
  left_join(metaData) %>%
  mutate(
    k = "k = 1"
    # PC1 = -PC1,
    # PC2 = -PC2
    )
anim2 <- pcaRUVg_k2$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  as_tibble() %>%
  dplyr::select(sample, one_of(c("PC1", "PC2"))) %>%
  left_join(dgeFalse_k2$samples) %>%
  left_join(metaData) %>%
  mutate(
    k = "k = 2"
    # PC1 = -PC1,
    # PC2 = -PC2
    )
anim3 <- pcaRUVg_k3$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  as_tibble() %>%
  dplyr::select(sample, one_of(c("PC1", "PC2"))) %>%
  left_join(dgeFalse_k3$samples) %>%
  left_join(metaData) %>%
  mutate(
    k = "k = 3"
    # PC1 = -PC1,
    # PC2 = -PC2
    )
anim4 <- pcaRUVg_k4$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  as_tibble() %>%
  dplyr::select(sample, one_of(c("PC1", "PC2"))) %>%
  left_join(dgeFalse_k4$samples) %>%
  left_join(metaData) %>%
  mutate(
    k = "k = 4"
    # PC1 = -PC1,
    # PC2 = -PC2
    )
anim5 <- pcaRUVg_k5$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  as_tibble() %>%
  dplyr::select(sample, one_of(c("PC1", "PC2"))) %>%
  left_join(dgeFalse_k5$samples) %>%
  left_join(metaData) %>%
  mutate(
    k = "k = 5"
    # PC1 = -PC1,
    # PC2 = -PC2
    )
anim <- anim0 %>%
  full_join(anim1) %>%
  full_join(anim2) %>%
  full_join(anim3) %>%
  full_join(anim4) %>%
  full_join(anim5) 
anim %>%
  ggplot(
    aes_string(
      "PC1", "PC2", colour = "group", shape = "provider", label = "sex"
    )
  ) +
  geom_point(size = 3, stroke = 0.5) +
  labs(colour = "Family", shape = "Provider") +
  xlab(label = "PC1") +
  ylab(label = "PC2") +
  coord_cartesian(
    xlim = c(-150, 150),
    ylim = c(-150, 150)
    ) +
  transition_states(
    k,
    transition_length = 2,
    state_length = 3
  ) +
  ease_aes('cubic-in-out') +
  ggtitle("Now showing {closest_state}")

Gene removal

Fitting gene length and GC content as predictors of PC1 showed that genes of length bin 0.1-0.8 kb and gene GC content bin 52-67% were the biggest contributors. Therefore, it should be tested if pulling these genes out restores the PCA to represent more what is expected due to biological variation.

# Filter for genes that have short length and high GC content.
# To be used for subsetting filtered dgeList
geneFilter <- geneGCLen %>%
  mutate(
    lenTest = length > 800,
    gcTest = gc < 0.52
  ) %>%
  dplyr::filter(lenTest, gcTest)
# Subset filtered dgeList
dgeRemoved <- dgeFilt[geneFilter$gene_id,]

PCA

# Run PCA function
pcaRemoved <- dgeRemoved %>%
  cpm(log = TRUE) %>%
  t() %>%
  prcomp()
# Quick inspection to check whether first two PCA components capture most of the variability
summary(pcaRemoved)$importance %>% pander()
  PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10 PC11 PC12 PC13 PC14 PC15 PC16 PC17 PC18 PC19 PC20 PC21 PC22 PC23 PC24 PC25 PC26 PC27 PC28 PC29 PC30
Standard deviation 51.6 38.04 29.91 21.36 19.88 18.78 15.45 13.65 12.67 12.4 11.4 10.63 9.996 9.513 9.435 9.22 8.775 8.621 8.292 8.231 8.205 7.947 7.842 7.635 7.445 7.311 7.082 6.729 6.454 7.458e-14
Proportion of Variance 0.3193 0.1735 0.1073 0.05473 0.0474 0.0423 0.02861 0.02235 0.01925 0.01844 0.01559 0.01355 0.01198 0.01085 0.01067 0.01019 0.00923 0.00891 0.00824 0.00812 0.00807 0.00757 0.00737 0.00699 0.00665 0.00641 0.00601 0.00543 0.005 0
Cumulative Proportion 0.3193 0.4928 0.6001 0.6548 0.7022 0.7445 0.7731 0.7955 0.8147 0.8331 0.8487 0.8623 0.8743 0.8851 0.8958 0.906 0.9152 0.9241 0.9324 0.9405 0.9486 0.9561 0.9635 0.9705 0.9771 0.9836 0.9896 0.995 1 1
# Plot PCA
pcaRemoved$x %>%
    as.data.frame() %>%
    rownames_to_column("sample") %>%
    as_tibble() %>%
    dplyr::select(sample, one_of(c("PC1", "PC2"))) %>%
    left_join(dgeRemoved$samples) %>%
    left_join(metaData) %>%
    ggplot(
      aes_string(
        "PC1", "PC2", colour = "group", shape = "provider", label = "sex"
      )
    ) +
    geom_point(size = 3, stroke = 0.5) +
    # geom_text_repel(show.legend = FALSE) +
    labs(colour = "Family", shape = "Provider") +
    xlab(label = paste0("PC1", " (", percent(summary(pcaRemoved)$importance[2, "PC1"]), ")")) +
    ylab(label = paste0("PC2", " (", percent(summary(pcaRemoved)$importance[2, "PC2"]), ")")) +
    theme_bw()

This was also tried with gc < 0.44, but very little difference between the two PCA’s was observed.